EDA Practice
EDA¶
PACKAGES¶
import pandas as pd
import numpy as np
import pandas_profiling as pp
LOAD DATA¶
# Source CSV for the insurance data set, read straight from its URL.
INSURANCE_CSV_URL = "http://www.ishelp.info/data/insurance.csv"
df = pd.read_csv(INSURANCE_CSV_URL)
SUMMARY_OF_DATA¶
# Automated overview: create an AutoViz instance and hand it the same CSV URL
# that was read into `df`.
# NOTE(review): this presumably makes AutoViz fetch the file again rather than
# reuse `df` — confirm against the AutoViz documentation.
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
df_av = AV.AutoViz('http://www.ishelp.info/data/insurance.csv')
# Build a pandas-profiling report for the whole DataFrame.
pp.ProfileReport(df)
# Manual spot checks: first rows, summary statistics, (rows, columns) shape,
# and the column labels.
df.head()
df.describe()
df.shape
df.columns
# Number of non-null entries in each column, printed as "<column>: <count>".
for column_name in ('age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'):
    print(f'{column_name}: {df[column_name].count()}')
unique_values¶
# Number of distinct values in each column, printed as "<column>: <n_unique>".
for column_name in ('age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'):
    print(f'{column_name}: {df[column_name].nunique()}')
typing¶
# dtype of each column, printed as "<label>: <dtype>".
# Each pair is (printed label, column name); the first label carries a trailing
# space so the output keeps the existing "age : ..." spelling.
dtype_labels = (
    ('age ', 'age'),
    ('sex', 'sex'),
    ('bmi', 'bmi'),
    ('children', 'children'),
    ('smoker', 'smoker'),
    ('region', 'region'),
    ('charges', 'charges'),
)
for label, column_name in dtype_labels:
    print(f'{label}: {df[column_name].dtype}')
isnull¶
# Count of missing (null) entries in each column, printed as "<label>: <count>".
# Each pair is (printed label, column name); the first label carries a trailing
# space so the output keeps the existing "age : ..." spelling.
null_labels = (
    ('age ', 'age'),
    ('sex', 'sex'),
    ('bmi', 'bmi'),
    ('children', 'children'),
    ('smoker', 'smoker'),
    ('region', 'region'),
    ('charges', 'charges'),
)
for label, column_name in null_labels:
    print(f'{label}: {df[column_name].isnull().sum()}')
# Missing-value count for every column, as one Series.
# df.isnull().sum() already covers all columns, so it is printed once; wrapping
# it in a loop over the columns (without using the loop variable) only repeats
# the identical summary once per column.
print(df.isnull().sum())
# Underlying cell values of the DataFrame as an array.
df.values
LABEL_ENCODER¶
# Disabled manual alternative: map category labels to integers with replace().
#replaced_base['Ward']=dropped_base['Ward'].replace(['BMT1', 'BMT2', "BMT3","BMT4"],
#[0, 1, 2 , 3], inplace=False)
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# Overwrite the 'sex' column in place with the encoder's output; the original
# values of that column are no longer available in `df` after this line.
df["sex"] = le.fit_transform(df["sex"])
# Show the DataFrame with the re-encoded column.
df
# List the columns whose dtype is still 'object'.
for col in df.columns:
    if df.dtypes[col] != 'object':
        continue
    print(col)
    #base[col] = le.fit_transform(base[col])
# Keep only the columns that contain at least 200 non-null values.
# `how` is deliberately not passed: recent pandas versions raise TypeError when
# `how` and `thresh` are given together, and older versions ignored `how`
# whenever `thresh` was set, so `thresh` alone is the effective rule.
# `subset=None` and `inplace=False` are the defaults and are omitted.
dropped_df = df.dropna(axis=1, thresh=200)
dropped_df
Importing Needed packages¶
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
%matplotlib inline
Downloading Data¶
# Shell escape: download the CSV and save it locally as FuelConsumption.csv (-O sets the output file).
!wget -O FuelConsumption.csv https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%202/data/FuelConsumptionCo2.csv
Reading the data in¶
# Load the downloaded CSV into `df` (this rebinds the `df` name used for the
# insurance data above).
df = pd.read_csv("FuelConsumption.csv")
# take a look at the dataset
df.head()
Data Exploration¶
Let's first do a descriptive exploration of our data.
# summarize the data: describe() reports summary statistics for the DataFrame's columns
df.describe()
Let's select some features to explore more.
# Narrow the data to the four features explored below, then preview nine rows.
feature_columns = ['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_COMB', 'CO2EMISSIONS']
cdf = df[feature_columns]
cdf.head(9)
We can plot each of these features:
# Histogram of each selected feature, drawn in this column order.
histogram_columns = ['CYLINDERS', 'ENGINESIZE', 'CO2EMISSIONS', 'FUELCONSUMPTION_COMB']
viz = cdf[histogram_columns]
viz.hist()
plt.show()
Now, let's plot each of these features against the Emission, to see how linear their relationship is:
# Scatter plot: combined fuel consumption on the x-axis, CO2 emissions on the y-axis.
fuel_consumption = cdf['FUELCONSUMPTION_COMB']
co2_emissions = cdf['CO2EMISSIONS']
plt.scatter(fuel_consumption, co2_emissions, color='blue')
plt.xlabel("FUELCONSUMPTION_COMB")
plt.ylabel("Emission")
plt.show()